* Session setup: start from an empty session, lift the variable limit,
* fix the random-number seed, and open a fresh text log.
clear all
set more off
set maxvar 100000

set seed 20240549

* Modify the following line to choose the working directory
*cd ""

* Close any log left open by an earlier run before opening a new one
capture log close
log using "Fig1.txt", text replace

* Data: white males with at least a bachelor's degree residing in California
use ACS2018_CA, clear

* Log hourly wage: incwage divided by (uhrswork x 52)
* NOTE(review): presumably incwage is annual wage income and uhrswork is
* usual weekly hours (IPUMS naming) - confirm against the extract codebook.
generate hw = incwage/(uhrswork*52)
generate lhw = ln(hw)

* Keep only observations whose log hourly wage is defined
keep if lhw != .

summarize lhw

* Group means: one observation per age x educd x degfieldd cell,
* holding the cell average of lhw
collapse (mean) lhw, by(age educd degfieldd)

summarize lhw

* regression models
*
* Seven factor-variable specifications, ranging from additive main effects
* (m1) to the full three-way interaction (m7). They are looked up by index
* as `m`i'' when the models are fitted.
*
* After the collapse the only education variable in memory is educd, so it
* is spelled out in full in every specification. Writing "educ" would only
* work through Stata's variable-abbreviation matching and breaks under
* -set varabbrev off-.

local m1 "i.educd i.age i.degfieldd"
local m2 "i.educd##i.age i.degfieldd"
local m3 "i.educd##i.degfieldd i.age"
local m4 "i.educd##i.age i.educd##i.degfieldd"
local m5 "i.educd##i.age i.age##i.degfieldd"
local m6 "i.educd##i.degfieldd i.age##i.degfieldd"
local m7 "i.educd##i.age##i.degfieldd"

* Penalty level (lambda) passed to elasticnet / lassoselect
local ld0 = 1e-7

* Randomly split the cells into two groups with proportions 0.2 and 0.8;
* group 1 is labelled "Training" and group 2 "Testing"
splitsample, generate(sample) split(0.2 0.8)

label define svalues 1 "Training" 2 "Testing"
label values sample svalues

* Tabulate the split and keep the group counts
tabulate sample, matcell(tmp)
local n_train = tmp[1,1]
local n_test = tmp[2,1]

* Noise ingredients for the training sample:
*   e_tmp1 - observation-level draw, normal with mean 0 and sd 0.5
*   e_tmp2 - second draw with the same distribution
*   e_c    - mean of e_tmp2 within each degfieldd group
* The two draws must stay in this order to reproduce the same random numbers.
generate e_tmp1 = rnormal(0, 0.5)
generate e_tmp2 = rnormal(0, 0.5)
by degfieldd, sort: egen e_c = mean(e_tmp2)

* Loop over the cluster-noise weight c = j/100 for j = 0, 25, 50, 75.
* Each pass builds a noisy training outcome, fits the seven specifications
* on the training sample, and writes parameter counts with training and
* testing MSE to results_<j>.
foreach j of numlist 0 25 50 75 {

	* Mix observation-level noise (weight 1-c) with degfieldd-level noise
	* (weight c), standardize the mix, then halve it
	egen e_cluster_`j' = std((1-(`j'/100))*e_tmp1+(`j'/100)*e_c)
	replace e_cluster_`j' = e_cluster_`j'/2

	* Outcome equals lhw everywhere; noise is added only where sample == 1
	generate lhw_cluster_`j' = lhw
	replace lhw_cluster_`j' = lhw + e_cluster_`j' if sample == 1

	* Fit every specification at alpha = 0 and lambda = ld0 on sample 1,
	* storing each fit and the value of e(k_allvars)
	local fits
	forvalues i = 1/7 {
		quietly elasticnet linear lhw_cluster_`j' `m`i'' ///
			if sample == 1, alpha(0) selection(none) grid(1,min(`ld0'))
		lassoselect alpha = 0 lambda = `ld0'
		estimates store ridgeless`i'
		scalar p`i' = e(k_allvars)
		local fits `fits' ridgeless`i'
	}

	* Goodness of fit for all stored models, reported separately by sample
	lassogof `fits', over(sample)

	matrix res = r(table)
	matrix mse = res[1..14,1]
	matrix rsq = res[1..14,2]

	* Odd rows of res are taken as in-sample (training) MSE,
	* even rows as out-of-sample (testing) MSE
	matrix mse_in = res[1,1] \ res[3,1] \ res[5,1] \ res[7,1] ///
		\ res[9,1] \ res[11,1] \ res[13,1]
	matrix mse_out = res[2,1] \ res[4,1] \ res[6,1] \ res[8,1] ///
		\ res[10,1] \ res[12,1] \ res[14,1]

	matrix pm = (p1, p2, p3, p4, p5, p6, p7)'

	* One row per specification: p, training error, testing error
	matrix results = (pm, mse_in, mse_out)
	matrix colnames results = p train_error test_error
	matrix rownames results = spec1 spec2 spec3 spec4 spec5 spec6 spec7

	matrix list results

	mat2txt, matrix(results) saving("results_`j'") replace ///
		title(in-sample vs. out-of-sample MSE with c=`j'/100)

}
		 
* Close the log (ignoring the error if none is open) and end the do-file
capture log close

exit

